1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 import java.nio.charset.*;
31 import java.nio.*;
32 import java.util.*;
33
34 public class TestUTF8 {
35 static char[] decode(byte[] bb, String csn, boolean testDirect)
36 throws Exception {
37 CharsetDecoder dec = Charset.forName(csn).newDecoder();
38 ByteBuffer bbf;
39 CharBuffer cbf;
40 if (testDirect) {
41 bbf = ByteBuffer.allocateDirect(bb.length);
42 cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
43 bbf.put(bb).flip();
44 } else {
45 bbf = ByteBuffer.wrap(bb);
46 cbf = CharBuffer.allocate(bb.length);
47 }
48 CoderResult cr = dec.decode(bbf, cbf, true);
49 if (cr != CoderResult.UNDERFLOW)
50 throw new RuntimeException("Decoding err: " + csn);
51 char[] cc = new char[cbf.position()];
52 cbf.flip(); cbf.get(cc);
53 return cc;
54
55 }
56
57 static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
58 throws Exception {
59 CharsetDecoder dec = Charset.forName(csn).newDecoder();
60 ByteBuffer bbf;
61 CharBuffer cbf;
62 if (testDirect) {
63 bbf = ByteBuffer.allocateDirect(bb.length);
64 cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
65 bbf.put(bb).flip();
66 } else {
67 bbf = ByteBuffer.wrap(bb);
68 cbf = CharBuffer.allocate(bb.length);
69 }
70 return dec.decode(bbf, cbf, true);
71 }
72
73
74 static char[] decode(Charset cs, byte[] ba, int off, int len) {
75 CharsetDecoder cd = cs.newDecoder();
76 int en = (int)(len * cd.maxCharsPerByte());
77 char[] ca = new char[en];
78 if (len == 0)
79 return ca;
80 cd.onMalformedInput(CodingErrorAction.REPLACE)
81 .onUnmappableCharacter(CodingErrorAction.REPLACE)
82 .reset();
83
84 ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
85 CharBuffer cb = CharBuffer.wrap(ca);
86 try {
87 CoderResult cr = cd.decode(bb, cb, true);
88 if (!cr.isUnderflow())
89 cr.throwException();
90 cr = cd.flush(cb);
91 if (!cr.isUnderflow())
92 cr.throwException();
93 } catch (CharacterCodingException x) {
94 throw new Error(x);
95 }
96 return Arrays.copyOf(ca, cb.position());
97 }
98
99 static byte[] encode(char[] cc, String csn, boolean testDirect)
100 throws Exception {
101 ByteBuffer bbf;
102 CharBuffer cbf;
103 CharsetEncoder enc = Charset.forName(csn).newEncoder();
104 if (testDirect) {
105 bbf = ByteBuffer.allocateDirect(cc.length * 4);
106 cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
107 cbf.put(cc).flip();
108 } else {
109 bbf = ByteBuffer.allocate(cc.length * 4);
110 cbf = CharBuffer.wrap(cc);
111 }
112
113 CoderResult cr = enc.encode(cbf, bbf, true);
114 if (cr != CoderResult.UNDERFLOW)
115 throw new RuntimeException("Encoding err: " + csn);
116 byte[] bb = new byte[bbf.position()];
117 bbf.flip(); bbf.get(bb);
118 return bb;
119 }
120
121 static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
122 throws Exception {
123 ByteBuffer bbf;
124 CharBuffer cbf;
125 CharsetEncoder enc = Charset.forName(csn).newEncoder();
126 if (testDirect) {
127 bbf = ByteBuffer.allocateDirect(cc.length * 4);
128 cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
129 cbf.put(cc).flip();
130 } else {
131 bbf = ByteBuffer.allocate(cc.length * 4);
132 cbf = CharBuffer.wrap(cc);
133 }
134 return enc.encode(cbf, bbf, true);
135 }
136
137 static char[] getUTFChars() {
138 char[] cc = new char[0x10000 - 0xe000 + 0xd800 +
139 (0x110000 - 0x10000) * 2];
140 int pos = 0;
141 int i = 0;
142 for (i = 0; i < 0xd800; i++)
143 cc[pos++] = (char)i;
144 for (i = 0xe000; i < 0x10000; i++)
145 cc[pos++] = (char)i;
146 for (i = 0x10000; i < 0x110000; i++) {
147 pos += Character.toChars(i, cc, pos);
148 }
149 return cc;
150 }
151
152 static int to3ByteUTF8(char c, byte[] bb, int pos) {
153 bb[pos++] = (byte)(0xe0 | ((c >> 12)));
154 bb[pos++] = (byte)(0x80 | ((c >> 06) & 0x3f));
155 bb[pos++] = (byte)(0x80 | ((c >> 00) & 0x3f));
156 return 3;
157 }
158
159 static void checkRoundtrip(String csn) throws Exception {
160 System.out.printf(" Check roundtrip <%s>...", csn);
161 char[] cc = getUTFChars();
162 byte[] bb = encode(cc, csn, false);
163 char[] ccO = decode(bb, csn, false);
164
165 if (!Arrays.equals(cc, ccO)) {
166 System.out.printf(" non-direct failed");
167 }
168 bb = encode(cc, csn, true);
169 ccO = decode(bb, csn, true);
170 if (!Arrays.equals(cc, ccO)) {
171 System.out.print(" (direct) failed");
172 }
173
174 if (!Arrays.equals(bb, new String(cc).getBytes(csn))) {
175 System.out.printf(" String.getBytes() failed");
176 }
177 if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
178 System.out.printf(" String.toCharArray() failed");
179 }
180 System.out.println();
181 }
182
183 static void check6ByteSurrs(String csn) throws Exception {
184 System.out.printf(" Check 6-byte Surrogates <%s>...%n", csn);
185 byte[] bb = new byte[(0x110000 - 0x10000) * 6];
186 char[] cc = new char[(0x110000 - 0x10000) * 2];
187 int bpos = 0;
188 int cpos = 0;
189 for (int i = 0x10000; i < 0x110000; i++) {
190 Character.toChars(i, cc, cpos);
191 bpos += to3ByteUTF8(cc[cpos], bb, bpos);
192 bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
193 cpos += 2;
194 }
195
196 char[] ccO = decode(bb, csn, false);
197 if (!Arrays.equals(cc, ccO)) {
198 System.out.printf(" decoding failed%n");
199 }
200 ccO = decode(bb, csn, true);
201 if (!Arrays.equals(cc, ccO)) {
202 System.out.printf(" decoding(direct) failed%n");
203 }
204
205
206
207 if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
208 System.out.printf(" String.toCharArray() failed");
209 }
210 }
211
212 static void compare(String csn1, String csn2) throws Exception {
213 System.out.printf(" Diff <%s> <%s>...%n", csn1, csn2);
214 char[] cc = getUTFChars();
215
216 byte[] bb1 = encode(cc, csn1, false);
217 byte[] bb2 = encode(cc, csn2, false);
218 if (!Arrays.equals(bb1, bb2))
219 System.out.printf(" encoding failed%n");
220 char[] cc1 = decode(bb1, csn1, false);
221 char[] cc2 = decode(bb1, csn2, false);
222 if (!Arrays.equals(cc1, cc2)) {
223 System.out.printf(" decoding failed%n");
224 }
225
226 bb1 = encode(cc, csn1, true);
227 bb2 = encode(cc, csn2, true);
228 if (!Arrays.equals(bb1, bb2))
229 System.out.printf(" encoding (direct) failed%n");
230 cc1 = decode(bb1, csn1, true);
231 cc2 = decode(bb1, csn2, true);
232 if (!Arrays.equals(cc1, cc2)) {
233 System.out.printf(" decoding (direct) failed%n");
234 }
235 }
236
237
238 static byte[][] malformed = {
239
240 {1, (byte)0xFF },
241 {1, (byte)0xC0 },
242 {1, (byte)0x80 },
243
244 {1, (byte)0xFF, (byte)0xFF},
245 {1, (byte)0xA0, (byte)0x80},
246
247
248 {1, (byte)0xC0, (byte)0x80},
249 {1, (byte)0xC1, (byte)0xBF},
250 {1, (byte)0xC2, (byte)0x00},
251 {1, (byte)0xC2, (byte)0xC0},
252 {1, (byte)0xD0, (byte)0x00},
253 {1, (byte)0xD0, (byte)0xC0},
254 {1, (byte)0xDF, (byte)0x00},
255 {1, (byte)0xDF, (byte)0xC0},
256
257
258 {1, (byte)0xE0, (byte)0x80, (byte)0x80},
259 {1, (byte)0xE0, (byte)0x80, (byte)0x80 },
260 {1, (byte)0xE0, (byte)0x81, (byte)0xBF },
261 {1, (byte)0xE0, (byte)0x9F, (byte)0xBF },
262
263 {1, (byte)0xE0, (byte)0xC0, (byte)0xBF },
264 {2, (byte)0xE0, (byte)0xA0, (byte)0x7F },
265 {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 },
266 {1, (byte)0xFF, (byte)0xFF, (byte)0xFF },
267 {1, (byte)0xE0, (byte)0xC0, (byte)0x80 },
268 {1, (byte)0xE0, (byte)0x80, (byte)0xC0 },
269
270
271 {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 },
272 {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF },
273 {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF },
274 {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF },
275
276 {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF },
277 {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},
278 {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 },
279 {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 },
280 {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 },
281
282 {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 },
283 {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 },
284 {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 },
285 {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 },
286 {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 },
287 {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 },
288
289
290 {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},
291 {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
292 {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF },
293 {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF },
294 {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF },
295
296 {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
297 {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
298 {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
299 {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },
300
301
302 {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
303 {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF },
304 {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF },
305 {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF },
306 {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
307 {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
308 {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
309 {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
310 {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
311 };
312
313 static void checkMalformed(String csn) throws Exception {
314 boolean failed = false;
315 System.out.printf(" Check malformed <%s>...%n", csn);
316 Charset cs = Charset.forName(csn);
317 for (boolean direct: new boolean[] {false, true}) {
318 for (byte[] bins : malformed) {
319 int mlen = bins[0];
320 byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
321 CoderResult cr = decodeCR(bin, csn, direct);
322 String ashex = "";
323 for (int i = 0; i < bin.length; i++) {
324 if (i > 0) ashex += " ";
325 ashex += Integer.toBinaryString((int)bin[i] & 0xff);
326 }
327 if (!cr.isMalformed()) {
328 System.out.printf(" FAIL(direct=%b): [%s] not malformed.%n", direct, ashex);
329 failed = true;
330 } else if (cr.length() != mlen) {
331 System.out.printf(" FAIL(direct=%b): [%s] malformed[len=%d].%n", direct, ashex, cr.length());
332 failed = true;
333 }
334 if (!Arrays.equals(decode(cs, bin, 0, bin.length),
335 new String(bin, csn).toCharArray())) {
336 System.out.printf(" FAIL(new String(bb, %s)) failed%n", csn);
337 failed = true;
338 }
339 }
340 }
341 if (failed)
342 throw new RuntimeException("Check malformed failed " + csn);
343 }
344
345 static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
346 int inPos = flow[0];
347 int inLen = flow[1];
348 int outPos = flow[2];
349 int outLen = flow[3];
350 int expedInPos = flow[4];
351 int expedOutPos = flow[5];
352 CoderResult expedCR = (flow[6]==0)?CoderResult.UNDERFLOW
353 :CoderResult.OVERFLOW;
354 ByteBuffer bbf;
355 CharBuffer cbf;
356 if (direct) {
357 bbf = ByteBuffer.allocateDirect(inPos + utf8s.length);
358 cbf = ByteBuffer.allocateDirect((outPos + outLen)*2).asCharBuffer();
359 } else {
360 bbf = ByteBuffer.allocate(inPos + utf8s.length);
361 cbf = CharBuffer.allocate(outPos + outLen);
362 }
363 bbf.position(inPos);
364 bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen);
365 cbf.position(outPos);
366 dec.reset();
367 CoderResult cr = dec.decode(bbf, cbf, false);
368 if (cr != expedCR ||
369 bbf.position() != expedInPos ||
370 cbf.position() != expedOutPos) {
371 System.out.printf("Expected(direct=%5b): [", direct);
372 for (int i:flow) System.out.print(" " + i);
373 System.out.println("] CR=" + cr +
374 ", inPos=" + bbf.position() +
375 ", outPos=" + cbf.position());
376 return false;
377 }
378 return true;
379 }
380
381 static void checkUnderOverflow(String csn) throws Exception {
382 System.out.printf(" Check under/overflow <%s>...%n", csn);
383 CharsetDecoder dec = Charset.forName(csn).newDecoder();
384 boolean failed = false;
385 byte[] utf8s = new String("\u007f\u07ff\ue000\ud800\udc00").getBytes("UTF-8");
386 int inlen = utf8s.length;
387
388 for (int inoff = 0; inoff < 20; inoff++) {
389 for (int outoff = 0; outoff < 20; outoff++) {
390 int[][] Flows = {
391
392 {inoff, inlen, outoff, 1, inoff + 1, outoff + 1, 1},
393 {inoff, inlen, outoff, 2, inoff + 3, outoff + 2, 1},
394 {inoff, inlen, outoff, 3, inoff + 6, outoff + 3, 1},
395 {inoff, inlen, outoff, 4, inoff + 6, outoff + 3, 1},
396 {inoff, inlen, outoff, 5, inoff + 10,outoff + 5, 0},
397
398 {inoff, 1, outoff, 5, inoff + 1, outoff + 1, 0},
399 {inoff, 2, outoff, 5, inoff + 1, outoff + 1, 0},
400 {inoff, 3, outoff, 5, inoff + 3, outoff + 2, 0},
401 {inoff, 4, outoff, 5, inoff + 3, outoff + 2, 0},
402 {inoff, 5, outoff, 5, inoff + 3, outoff + 2, 0},
403 {inoff, 6, outoff, 5, inoff + 6, outoff + 3, 0},
404 {inoff, 7, outoff, 5, inoff + 6, outoff + 3, 0},
405 {inoff, 8, outoff, 5, inoff + 6, outoff + 3, 0},
406 {inoff, 9, outoff, 5, inoff + 6, outoff + 3, 0},
407 {inoff, 10, outoff, 5, inoff + 10,outoff + 5, 0},
408
409 {inoff, 2, outoff, 1, inoff + 1, outoff + 1, 0},
410 {inoff, 3, outoff, 1, inoff + 1, outoff + 1, 1},
411
412 {inoff, 4, outoff, 2, inoff + 3, outoff + 2, 0},
413 {inoff, 5, outoff, 2, inoff + 3, outoff + 2, 0},
414 {inoff, 6, outoff, 2, inoff + 3, outoff + 2, 1},
415
416 {inoff, 7, outoff, 4, inoff + 6, outoff + 3, 0},
417 {inoff, 8, outoff, 4, inoff + 6, outoff + 3, 0},
418 {inoff, 9, outoff, 4, inoff + 6, outoff + 3, 0},
419 {inoff, 10, outoff, 4, inoff + 6, outoff + 3, 1},
420 };
421 for (boolean direct: new boolean[] {false, true}) {
422 for (int[] flow: Flows) {
423 if (!check(dec, utf8s, direct, flow))
424 failed = true;
425 }
426 }}}
427 if (failed)
428 throw new RuntimeException("Check under/overflow failed " + csn);
429 }
430
431 public static void main(String[] args) throws Exception {
432 checkRoundtrip("UTF-8");
433 check6ByteSurrs("UTF-8");
434
435 checkMalformed("UTF-8");
436 checkUnderOverflow("UTF-8");
437 }
438 }